#### Visual Analytics Coursework ####
# Import the Required Modules
import time
start = time.time()
# Set matplotlib to plot in the notebook
%pylab inline
import pandas as pd
import numpy as np
from __future__ import division
import utils
import seaborn
from six.moves import zip
from sklearn import preprocessing
# Set the Default Seaborn Colours
seaborn.set()
colors = seaborn.color_palette()
seaborn.set_context(rc={"figure.figsize": (12, 12)})
# Turn off Pandas Future Warnings
pd.set_option('chained_assignment',None)
from collections import OrderedDict
import numpy as np
from bokeh.charts import Histogram
from bokeh.plotting import *
output_notebook()
# Import the Data - Using Pandas
Data_CF = pd.read_csv('Crossfit_Open_2011_Dataset.csv',
                      sep=',')
# Check the Import for Errors (eyeball the first five rows)
print(Data_CF.head(5))
# Get a List of Column Headers for Reference
Column_Names = Data_CF.columns
# Echo the column names as a list (notebook display only; value unused)
[x for x in Column_Names]
# Subset of features used for the analysis.
# NOTE: several CSV headers carry a leading space (' score1',
# ' overall-points', ' rank2', ...) -- the odd-looking keys below are
# deliberate, not typos.
Data_CF_VA = Data_CF[['athlete_ID',
                      'First_Name',
                      'Last_Name',
                      'Region',
                      'age',
                      'Gender',
                      'Height_cm',
                      'Weight_kg',
                      ' overall-points',
                      'overall-rank',
                      ' score1',
                      'rank1',
                      ' score2',
                      ' rank2',
                      ' score3',
                      'rank3',
                      ' score4',
                      'rank4',
                      ' score5',
                      'rank5',
                      ' score6',
                      'rank6']]
# Summary statistics, transposed so each feature is a row
Data_CF_VA.describe().T
# Phase 1 of Methodology - plot each feature on a histogram.
# Data_CF_VA_Columns is reused later when cleaning the score columns.
Data_CF_VA_Columns = Data_CF_VA.columns
# (column, bins, title, x-label) for each univariate histogram; the leading
# spaces in the score column names come from the CSV headers.
_hist_specs = [
    ('Height_cm', 100, 'Height Distribution', 'Height (cm)'),
    ('age', 24, 'Age Distribution', 'Age'),
    ('Weight_kg', 100, 'Weight (kg) Distribution', 'Weight (kilograms)'),
    (' overall-points', 100, 'Final Points Distribution', 'Points (binned 50 into groups)'),
    (' score1', 50, 'Week 1 Points Distribution', 'Points (binned 50 into groups)'),
    (' score2', 50, 'Week 2 Points Distribution', 'Points (binned 50 into groups)'),
    (' score3', 50, 'Week 3 Points Distribution', 'Points (binned 50 into groups)'),
    (' score4', 50, 'Week 4 Points Distribution', 'Points (binned 50 into groups)'),
    (' score5', 50, 'Week 5 Points Distribution', 'Points (binned 50 into groups)'),
    (' score6', 50, 'Week 5.1 Points Distribution', 'Points (binned 50 into groups)'),
]
# One figure per feature, in the same order as the original cells.
# Height clearly contains outliers - accommodation for this is applied later.
# Note: the 'Top' performing athletes are those with the lowest scores <> 0.
for _col, _bins, _title, _xlabel in _hist_specs:
    Data_CF_VA[_col].hist(bins=_bins)
    plt.title(_title)
    plt.ylabel('Frequency')
    plt.xlabel(_xlabel)
    plt.show()
# Phase 1 Conclusions
#
# Age - The histogram plot conveys a distribution that I would expect, whereby its heavily dominated by the number of
# athletes in the age of 20-40.
# No requirement to normalise the data yet or outlier removal since the data is distributed as expected.
#
# Height - The histogram plot clearly identifies a number of athletes that have erroneous/incorrect values for their height.
# Outliers will need to be removed and then the plot will need to be reassessed.
# Transform in the form of normalisation or standardisation may be required.
#
# Weight - The plot appears to be well distributed amongst the bins; when changing the number of bins it can be noted that
# there are some outliers - removal required since they appear to be dubious: <50 kg and >160 kg
#
# Overall Points - There appear to be some clear distinctions in the distribution
# 1 - A high peak and clear group can be seen where the number of points <500
# 2 - Another group from about 500-3000 points
# 3 - The final set where the number of points >3000
#
# Week 1-5 Scores - The score plots consider all possible values.
# When considering and comparing groups consideration of the missing values, where the score is 0, need to be
# accounted for.
# The number of individuals entering appears to decrease from Week 1 to Week 5.
# Further analysis will be needed to understand if this really is the case.
# Phase 1.1 - Adjustments to Features based on the Conclusions mentioned above
# Dealing with Height
# Height - The tallest man alive is 251cm - Sultan Kösen (Turkey, b.10 December 1982)
# http://www.guinnessworldrecords.com/world-records/tallest-man-living
# Boolean mask: True where the recorded height is plausible (< 251 cm)
Height_Rule = Data_CF_VA.Height_cm < 251
# Persist the mask as a column so it can be tabulated
Data_CF_VA['Height_Gr_251'] = Data_CF_VA.Height_cm < 251
# Frequency TABLE: how many rows pass / fail the height sanity check
pd.Series(Data_CF_VA['Height_Gr_251']).value_counts()
# The output below indicate the number of usable instances where the height can be included as part of the analysis
# Helper: draw a central-tendency marker together with its confidence interval.
def confid_int_plot(point, ci, y, color, label):
    """Draw *ci* (a [lo, hi] pair) as a horizontal bar at height *y* on the
    current axes, with *point* drawn as a dot on the same baseline."""
    # Use the explicit pyplot namespace rather than the %pylab star-import
    # names; under %pylab these are the same functions.
    plt.plot(ci, [y, y], "-", color=color, linewidth=4, label=label)
    plt.plot(point, y, "o", color=color, markersize=10)
# Position on the y-axis where the interval will be plotted
int_y = 500
# Statistics for the height distribution (erroneous heights excluded)
d = Data_CF_VA[Data_CF_VA.Height_cm < 251].Height_cm
m = d.mean()
s = d.std()
# Histogram of the cleaned heights
plt.hist(d.values, 50)
# Overlay the mean and a +/-4 standard-deviation interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Height Distribution')
plt.ylabel('Frequency')
plt.xlabel('Height (cm)')
plt.legend(loc="best")
plt.show()
# Same treatment for the Weight feature (erroneous values investigated)
d = Data_CF_VA.Weight_kg
m = d.mean()
s = d.std()
# Histogram of the non-null weights only
plt.hist(d[pd.notnull(d)].values, 50)
# Overlay the mean and a +/-4 standard-deviation interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('Frequency')
plt.xlabel('Weight (kg)')
plt.legend(loc="best")
plt.show()
# Collect the six weekly score columns into a separate DataFrame
Data_CF_VA_Scores = Data_CF_VA[[' score1',
                                ' score2',
                                ' score3',
                                ' score4',
                                ' score5',
                                ' score6']]
# A score of -1 denotes "no score submitted": convert to NaN
Data_CF_VA_Scores[Data_CF_VA_Scores == -1] = np.nan
# Count submitted vs missing scores per week
Number_Null = []; Number_Non_Null = []; DF_Length = Data_CF_VA_Scores.shape[0]
# Series.count() ignores NaN, so it yields the number of submitted scores
for col in Data_CF_VA_Scores:
    Values = Data_CF_VA_Scores[col].count()
    Number_Non_Null.append(Values)
    Number_Null.append(DF_Length - Values)
# Helper: convert (possibly nested) lists to tuples for plotting.
def totuple(a):
    """Recursively convert a nested iterable into nested tuples.

    Non-iterable leaves are returned unchanged.
    """
    try:
        items = iter(a)
    except TypeError:
        # Scalar leaf: nothing to convert
        return a
    return tuple(totuple(item) for item in items)
# Convert the count lists to tuples for plotting
Number_Non_Null = totuple(Number_Non_Null)
Number_Null = totuple(Number_Null)
ind = np.arange(len(Number_Non_Null))  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence
# Stacked bars: submitted scores (green) with missing (yellow) stacked on top
p1 = plt.bar(ind, Number_Non_Null, width, color='g')
p2 = plt.bar(ind, Number_Null, width, color='y', bottom=Number_Non_Null)
# Annotate the Chart
plt.ylabel('Number of Athletes Participating')
plt.title('Participation per Week')
plt.xticks(ind+width/2., ('Week1', 'Week2', 'Week3', 'Week4', 'Week5', 'Week6'))
plt.legend((p1[0], p2[0]), ('Score Submitted', 'No Score Submitted'), loc="best")
plt.show()
# There appears to be a consistent drop off in the number of scores submitted - investigate this in terms of
# Percentage change
# Helper for the week-on-week participation analysis.
def percent_change(old, new):
    """Return the percentage change from *old* to *new* (positive = growth)."""
    delta = new - old
    return (delta / float(old)) * 100
# Week-on-week percentage change in the number of submitted scores
Number_Non_Null_Change = []
# ind[:-1] replaces the original break-at-max loop: same pairs, clearer intent
for i in ind[:-1]:
    Number_Non_Null_Change.append(percent_change(Number_Non_Null[i], Number_Non_Null[i + 1]))
Number_Non_Null_Change = totuple(Number_Non_Null_Change)
# Bar chart of the changes
p1 = plt.bar(np.arange(len(Number_Non_Null_Change)), Number_Non_Null_Change, width, color='r')
# Annotate the Chart
plt.ylabel('% Change of the Number of Athletes Participating')
plt.ylim((-20, 20))
plt.title('Participation per Week')
plt.xticks(ind + width / 2., ('Week1 - Week2', 'Week2 - Week3', 'Week3 - Week4', 'Week4 - Week5', 'Week5 - Week6'), rotation=45)
# BUG FIX: plt.legend(labels='Week on Week Change') iterates the *string*,
# producing one legend entry per character; pass the bar handle and a
# one-element label list instead.
plt.legend([p1[0]], ['Week on Week Change'], loc="best")
plt.show()
# The biggest drops between weeks occurred between 2-3, 4-5 and 5-6.
# Re-visualise each week's score with the null values excluded.
# Note: the 'Top' performing athletes are those with the lowest scores <> 0.
def _score_hist_with_ci(series, title):
    """Histogram of *series* (nulls dropped) with the mean and a +/-4
    standard-deviation interval overlaid; returns (mean, std)."""
    m = series.mean()
    s = series.std()
    plt.hist(series[pd.notnull(series)].values, 50)
    confid_int_plot(m, [m - s * 4, m + s * 4], int_y, colors[1], "4 Standard Deviations Interval")
    # BUG FIX: these plots previously all carried the copy-pasted title
    # 'Weight Distribution'; each now names the week it actually shows.
    plt.title(title)
    plt.ylabel('Frequency')
    plt.xlabel('Points (binned 50 into groups)')
    plt.legend(loc="best")
    plt.show()
    return m, s

# Week 1: no adjustment - the bulk of the data lies within the interval
m, s = _score_hist_with_ci(Data_CF_VA_Scores[' score1'], 'Week 1 Points Distribution')
# Week 2: outliers visible beyond the 4-sd interval - remove and re-plot
d = Data_CF_VA_Scores[' score2']
m, s = _score_hist_with_ci(d, 'Week 2 Points Distribution')
# Keep only values within +/-4 standard deviations of the mean
Score_2_Rule = (d <= m + 4 * s) & (d >= m - 4 * s)
d = d[Score_2_Rule]
m, s = _score_hist_with_ci(d, 'Week 2 Points Distribution (outliers removed)')
# The resulting plot now corrects a lot of the observed problems.
# Weeks 3-6: no adjustment - the majority of the data lies within the CI
# (for week 4, excluding the tail would remove the "Top" scores).
for _wk in (3, 4, 5, 6):
    m, s = _score_hist_with_ci(Data_CF_VA_Scores[' score%d' % _wk], 'Week %d Points Distribution' % _wk)
# Apply all of the cleaning changes to the main dataset, then summarise:
# 1 - change -1 score sentinels to NaN
# 2 - apply the Score2 rule to exclude outliers
# 3 - remove the erroneous height values
# 4 - remove NaN overall points
# 5 - remove missing weight values
# 1 - Change -1 values to NaN
for name in Data_CF_VA_Columns:
    # Replace only those columns that contain score in the label
    if name.startswith(' score'):
        # BUG FIX: the sentinel is the *number* -1, not the string '-1'.
        # The score columns are numeric (mean/std are computed on them
        # above), so replacing the string was a silent no-op.
        Data_CF_VA[name].replace(-1, value=np.nan, inplace=True)
# 2 - Apply the Score2 rule to exclude the outliers
d = Data_CF_VA[' score2']
m = d.mean()
s = d.std()
# Rule to remove values beyond +/-4 standard deviations from the mean
Score_2_Rule = (d <= m + 4*s) & (d >= m - 4*s)
Data_CF_VA = Data_CF_VA[Score_2_Rule]
# 3 - Remove the erroneous height values (tallest living man: 251 cm)
Height_Rule = (Data_CF_VA.Height_cm < 251)
Data_CF_VA = Data_CF_VA[Height_Rule]
# 4 - Remove rows with NaN overall points
Rule4 = pd.notnull(Data_CF_VA[' overall-points'])
Data_CF_VA = Data_CF_VA[Rule4]
# 5 - Remove rows with missing weight (NaN fails the >= comparison)
Rule5 = Data_CF_VA['Weight_kg'] >= 0
Data_CF_VA = Data_CF_VA[Rule5]
# BUG FIX: print('...') % (...) applies % to print's return value (None)
# under Python 3; format the string before printing (works on 2 and 3).
print('Remaining Dataset Size\n\nNumber of Rows: %d\nNumber of Features: %d' % (Data_CF_VA.shape[0], Data_CF_VA.shape[1]))
# PHASE 2 - From Methodology
# Investigate relationships between features, all split by Gender
Data_CF_VA.Gender.value_counts()
# Proposed investigations:
#   Score - Age; Score - Overall Rank; Score - Weight;
#   Region - Score (box plots); Weight - Height coloured by Overall Rank
# Get Male and Female datasets
Data_CF_VA_Male = Data_CF_VA[Data_CF_VA.Gender == 'M']
Data_CF_VA_Female = Data_CF_VA[Data_CF_VA.Gender == 'F']

def _points_colors(frame):
    """Map ' overall-points' onto a 6-step colour ramp (min -> lightest)."""
    colorField = frame[' overall-points'].as_matrix()
    cm = np.array(["#C7E9B4", "#7FCDBB", "#41B6C4", "#1D91C0", "#225EA8", "#0C2C84"])
    # Scale to [0, 5] and truncate to pick a ramp index per athlete
    ix = ((colorField - colorField.min()) / (colorField.max() - colorField.min()) * 5).astype('int')
    return cm[ix]

# Define Colours - Female
colorsF = _points_colors(Data_CF_VA_Female)
# Define Colours - Male
# BUG FIX: the male palette was previously computed from the *female*
# frame (copy-paste error), so every male scatter was coloured by the
# wrong, misaligned score values.
colorsM = _points_colors(Data_CF_VA_Male)
# Bokeh scatter plots: overall points vs each feature, per gender, with the
# points coloured by overall points. The seventeen copy-pasted cells are
# collapsed into one helper; figures are shown in the original order.
TOOLS = "resize,crosshair,pan,wheel_zoom,box_zoom,reset,tap,previewsave,box_select,poly_select,lasso_select"

def _bokeh_scatter(frame, x_col, y_col, title, fill_colors, alpha):
    """Build, show and return a Bokeh scatter of frame[x_col] vs frame[y_col]."""
    fig = figure(tools=TOOLS,
                 title=title,
                 toolbar_location="left",
                 plot_width=800,
                 plot_height=800)
    fig.scatter(frame[x_col].astype(int).as_matrix(),
                frame[y_col].astype(int).as_matrix(),
                fill_color=fill_colors,
                fill_alpha=alpha,
                line_color=None)
    # Render inline in the IPython notebook
    show(fig)
    return fig

# Score - Age
p1 = _bokeh_scatter(Data_CF_VA_Male, 'age', ' overall-points', "Overall Points v Age - Male", colorsM, 0.6)
p2 = _bokeh_scatter(Data_CF_VA_Female, 'age', ' overall-points', "Overall Points v Age - Female", colorsF, 0.6)
# Score - Weekly scores (alpha values preserved from the original cells)
p3 = _bokeh_scatter(Data_CF_VA_Male, ' score1', ' overall-points', "Overall Points v Score1 - Male", colorsM, 0.6)
p4 = _bokeh_scatter(Data_CF_VA_Female, ' score1', ' overall-points', "Overall Points v Score1 - Female", colorsF, 1)
p5 = _bokeh_scatter(Data_CF_VA_Male, ' score2', ' overall-points', "Overall Points v Score2 - Male", colorsM, 0.6)
p6 = _bokeh_scatter(Data_CF_VA_Female, ' score2', ' overall-points', "Overall Points v Score2 - Female", colorsF, 1)
p7 = _bokeh_scatter(Data_CF_VA_Male, ' score3', ' overall-points', "Overall Points v Score3 - Male", colorsM, 0.6)
p8 = _bokeh_scatter(Data_CF_VA_Female, ' score3', ' overall-points', "Overall Points v Score3 - Female", colorsF, 1)
p9 = _bokeh_scatter(Data_CF_VA_Male, ' score4', ' overall-points', "Overall Points v Score4 - Male", colorsM, 0.6)
p12 = _bokeh_scatter(Data_CF_VA_Male, ' score5', ' overall-points', "Overall Points v Score5 - Male", colorsM, 0.6)
p11 = _bokeh_scatter(Data_CF_VA_Female, ' score4', ' overall-points', "Overall Points v Score4 - Female", colorsF, 1)
p13 = _bokeh_scatter(Data_CF_VA_Female, ' score5', ' overall-points', "Overall Points v Score5 - Female", colorsF, 1)
p14 = _bokeh_scatter(Data_CF_VA_Male, ' score6', ' overall-points', "Overall Points v Score6 - Male", colorsM, 1)
p15 = _bokeh_scatter(Data_CF_VA_Female, ' score6', ' overall-points', "Overall Points v Score6 - Female", colorsF, 1)
# Weight - Height
p16 = _bokeh_scatter(Data_CF_VA_Male, 'Weight_kg', 'Height_cm', "Weight v height - Male", colorsM, 1)
p17 = _bokeh_scatter(Data_CF_VA_Female, 'Weight_kg', 'Height_cm', "Weight v Height - Female", colorsF, 1)
# PHASE 3 - Describing the relationships
# Rank / weekly-score / body-metric subsets per gender for correlation analysis
Data_CF_VA_Scores_Male = Data_CF_VA_Male[['overall-rank',
                                          ' score1',
                                          ' score2',
                                          ' score3',
                                          ' score4',
                                          ' score5',
                                          ' score6',
                                          'Weight_kg',
                                          'age',
                                          'Height_cm']]
Data_CF_VA_Scores_Female = Data_CF_VA_Female[['overall-rank',
                                              ' score1',
                                              ' score2',
                                              ' score3',
                                              ' score4',
                                              ' score5',
                                              ' score6',
                                              'Weight_kg',
                                              'age',
                                              'Height_cm']]
# Correlation plot - Male (rows with any NaN dropped first)
seaborn.corrplot(Data_CF_VA_Scores_Male.dropna())
# Correlation plot - Female
seaborn.corrplot(Data_CF_VA_Scores_Female.dropna())
# PHASE 4 - Exploring the findings iteratively through interaction of Computational Techniques
# Plan: autoencoder for dimensionality reduction --
#   1. prepare/normalise the data
#   2. train the neural network
#   3. reconstruct to test the autoencoder's performance
#   4. encode down to two features
#   5. export the data to R and use h2o with Tableau
# Reasoning for use: PCA is restricted to the linearity assumption, whereas
# an autoencoder can have nonlinear encoder/decoder stages.
print('An Example of an Autoencoder is shown below\n')
print('In order to use this Autoencoder the following steps are:\n')
print('1 - Training the Networks - where the Input layer is the same as the output layer\n')
print('2 - Access the Reconstruction Error - Ensure the error is low before moving on\n')
print('3 - Encoding - Encoder the data on both the Male and Females datasets')
from IPython.display import Image
# Display the schematic of the autoencoder architecture
Image(filename='autoencoder.png')
# Denoising Autoencoder - To be used for Dimensionality Reduction
"""
Denoising Autoencoders (dA)
References :
- P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
2008
- DeepLearningTutorials
https://github.com/lisa-lab/DeepLearningTutorials
- Yusuke Sugomori: Stochastic Gradient Descent for Denoising Autoencoders,
http://yusugomori.com/docs/SGD_DA.pdf
"""
import sys
import numpy
# Silence numpy floating-point warnings (e.g. log(0) during training)
numpy.seterr(all='ignore')
def sigmoid(x):
    """Elementwise logistic function, 1 / (1 + e**-x)."""
    return 1.0 / (numpy.exp(-x) + 1)
class dA(object):
    """Denoising Autoencoder (plain-numpy implementation).

    Corrupts the input, encodes it to a hidden layer, decodes it back, and
    learns weights that reconstruct the clean input. Encoder and decoder
    use tied weights: the decoder matrix is W.T.
    """
    def __init__(self, input=None, n_visible=2, n_hidden=3,
                 W=None, hbias=None, vbias=None, numpy_rng=None):
        self.n_visible = n_visible  # num of units in visible (input) layer
        self.n_hidden = n_hidden    # num of units in hidden layer
        if numpy_rng is None:
            # Fixed seed so default-constructed networks are reproducible
            numpy_rng = numpy.random.RandomState(1234)
        if W is None:
            a = 1. / n_visible
            initial_W = numpy.array(numpy_rng.uniform(  # initialize W uniformly
                low=-a,
                high=a,
                size=(n_visible, n_hidden)))
            W = initial_W
        if hbias is None:
            hbias = numpy.zeros(n_hidden)  # initialize h bias 0
        if vbias is None:
            vbias = numpy.zeros(n_visible)  # initialize v bias 0
        self.numpy_rng = numpy_rng
        self.x = input
        self.W = W
        self.W_prime = self.W.T  # tied weights: decoder uses the transpose
        self.hbias = hbias
        self.vbias = vbias
        # self.params = [self.W, self.hbias, self.vbias]

    def get_corrupted_input(self, input, corruption_level):
        """Randomly zero each input element with probability corruption_level."""
        assert corruption_level < 1
        return self.numpy_rng.binomial(size=input.shape,
                                       n=1,
                                       p=1-corruption_level) * input

    # Encode
    def get_hidden_values(self, input):
        """Project the input onto the hidden layer (sigmoid activation)."""
        return sigmoid(numpy.dot(input, self.W) + self.hbias)

    # Decode
    def get_reconstructed_input(self, hidden):
        """Map hidden activations back to the visible layer."""
        return sigmoid(numpy.dot(hidden, self.W_prime) + self.vbias)

    def train(self, lr=0.1, corruption_level=0.3, input=None):
        """Perform one full-batch parameter update at learning rate *lr*.

        If *input* is given it replaces the stored training data.
        """
        if input is not None:
            self.x = input
        x = self.x
        tilde_x = self.get_corrupted_input(x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        # Backpropagated error terms for the cross-entropy reconstruction loss
        L_h2 = x - z
        L_h1 = numpy.dot(L_h2, self.W) * y * (1 - y)
        L_vbias = L_h2
        L_hbias = L_h1
        L_W = numpy.dot(tilde_x.T, L_h1) + numpy.dot(L_h2.T, y)
        self.W += lr * L_W
        self.hbias += lr * numpy.mean(L_hbias, axis=0)
        self.vbias += lr * numpy.mean(L_vbias, axis=0)

    def negative_log_likelihood(self, corruption_level=0.3):
        """Mean cross-entropy between the stored input and its reconstruction
        from a freshly corrupted copy (stochastic via the corruption draw)."""
        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        cross_entropy = - numpy.mean(
            numpy.sum(self.x * numpy.log(z) +
                      (1 - self.x) * numpy.log(1 - z),
                      axis=1))
        return cross_entropy

    def reconstruct(self, x):
        """Encode then decode *x* (no corruption applied)."""
        y = self.get_hidden_values(x)
        z = self.get_reconstructed_input(y)
        return z
def test_dA(learning_rate=0.1, corruption_level=0.1, training_epochs=500):
    """Smoke test: train a 20-visible / 2-hidden dA on a toy two-cluster
    binary dataset and print the reconstruction of two noisy samples."""
    data = numpy.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
    rng = numpy.random.RandomState(123)
    # construct dA
    da = dA(input=data, n_visible=20, n_hidden=2, numpy_rng=rng)
    # BUG FIX: xrange and the print *statement* are Python-2 only; range()
    # and the print() call behave identically here and also run on Python 3.
    for epoch in range(training_epochs):
        da.train(lr=learning_rate, corruption_level=corruption_level)
        # cost = da.negative_log_likelihood(corruption_level=corruption_level)
        # learning_rate *= 0.95
    # test: reconstruct two unseen noisy samples
    x = numpy.array([[1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0]])
    print(da.reconstruct(x))
# Main guard: the self-test is left disabled; the class is used directly below.
if __name__ == "__main__":
    #test_dA()
    print('Autoencoder Class built - Ready for testing')
# Prepare the data as plain numpy matrices
data_M_numerical = Data_CF_VA_Scores_Male.as_matrix()
data_F_numerical = Data_CF_VA_Scores_Female.as_matrix()
# Train the Neural Network - Male
# Define hyper-parameters
learning_rate = 0.001
corruption_level = 0.3
training_epochs = 5000
# NOTE(review): no training loop is ever executed -- the autoencoders below
# are only *constructed*, so learning_rate and training_epochs are unused
# and the weights stay at their random initialisation. Presumably a
# `for epoch in range(training_epochs): ...train(lr=learning_rate, ...)`
# loop was intended here; confirm before relying on the encoded outputs.
Male_Autoencoder = dA(input = data_M_numerical,
                      n_visible = data_M_numerical.shape[1],
                      n_hidden = 2)
# Train the Neural Network - Female
# Define hyper-parameters (repeated for the female network)
learning_rate = 0.001
corruption_level = 0.3
training_epochs = 5000
Female_Autoencoder = dA(input = data_F_numerical,
                        n_visible = data_F_numerical.shape[1],
                        n_hidden = 2)
# Reconstruction cost of each autoencoder on its own data
Mcost = Male_Autoencoder.negative_log_likelihood(corruption_level=corruption_level)
# BUG FIX: print('... %.4f') % (x) applies % to print's return value (None)
# under Python 3; format the string inside the call (works on 2 and 3).
print('Male Autoencoder Cost Value: %.4f' % (Mcost))
Fcost = Female_Autoencoder.negative_log_likelihood(corruption_level)
print('Female Autoencoder Cost Value: %.4f' % (Fcost))
# Encoding the data: project each dataset onto the two hidden units
Male_Autoencoder_Hidden_Values = pd.DataFrame(Male_Autoencoder.get_hidden_values(data_M_numerical),
                                              columns = ['Autoencoder_1','Autoencoder_2'])
Female_Autoencoder_Hidden_Values = pd.DataFrame(Female_Autoencoder.get_hidden_values(data_F_numerical),
                                                columns = ['Autoencoder_1','Autoencoder_2'])
# Identity columns to merge back onto the score subsets
Merged_Data = Data_CF[['athlete_ID',
                       'nameURL',
                       'First_Name',
                       'Last_Name',
                       'Region',
                       'sex&division',
                       'Gender']]
# Merge on the index (left join keeps every scored athlete) - Male
Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male.merge(Merged_Data,
                                                      how = 'left',
                                                      left_index = True,
                                                      right_index = True)
# Merge on the index - Female
Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female.merge(Merged_Data,
                                                          how = 'left',
                                                          left_index = True,
                                                          right_index = True)
Data_CF_VA_Scores_Male.describe().T
# Persist the four datasets for use in R / h2o / Tableau
Data_CF_VA_Scores_Male.to_csv('Subsetted_Male_Dataset.csv',
                              sep = ',',
                              index = False)
Data_CF_VA_Scores_Female.to_csv('Subsetted_Female_Dataset.csv',
                                sep = ',',
                                index = False)
Male_Autoencoder_Hidden_Values.to_csv('Autoencoder_Male_Dataset.csv',
                                      sep = ',',
                                      index = False)
Female_Autoencoder_Hidden_Values.to_csv('Autoencoder_Female_Dataset.csv',
                                        sep = ',',
                                        index = False)
Data_CF_VA_Scores_Female.describe().T
# PHASE 5 - Final Enhancements and Visualisations
# The final phase will involve using h2o and Tableau for more visualisations.
Image(filename='workflow.png')
# Time to Process
# BUG FIX: print('...') % (...) applies % to print's return value (None)
# under Python 3; format the string before printing (works on 2 and 3).
print('Time to Process Script: %.5f Seconds' % (time.time() - start))